In New York City, many people use Uber and taxis to get around the city and to commute to work. Companies need to understand how people move, including when rides are requested and how many orders arrive each day, so that they can anticipate traffic, increase profits, and serve customers better and on time.
# import required library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import geopandas
sns.set_style('whitegrid')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Load the raw Uber pickup records for April 2014.
uber = pd.read_csv(
    'uber-pickups-in-new-york-city/uber-raw-data-apr14.csv'
)
# Preview the first five rows.
uber.head()
# Table dimensions as (rows, columns); the original notebook reported 564516 rows and 4 columns.
uber.shape
# Column dtypes and non-null counts.
uber.info()
# Number of missing values in each column.
null_mask = uber.isnull()
null_mask.sum()
# Heatmap of the null mask: any missing cell would show up as a contrasting stripe.
null_fig, null_ax = plt.subplots(figsize=(9, 9))
sns.heatmap(
    null_mask,
    annot=False,
    cbar=False,
    yticklabels=False,
    cmap='viridis',
    ax=null_ax,
)
plt.show()
# Parse the 'Date/Time' strings (month/day/year, 24-hour clock) into pandas timestamps.
timestamp_format = '%m/%d/%Y %H:%M:%S'
uber['Date/Time'] = pd.to_datetime(uber['Date/Time'], format=timestamp_format)
uber.info()
# Switch to lowercase column names that can also be used as attributes.
renamed_columns = {
    'Date/Time': 'date_time',
    'Lat': 'lat',
    'Lon': 'lon',
    'Base': 'base',
}
uber = uber.rename(columns=renamed_columns)
uber.head()
# Show the first pickup timestamp followed by each of its components.
# .iloc[0] selects the first row by position, so it works regardless of the index labels.
first_pickup = uber.date_time.iloc[0]
# The full timestamp used to be labelled 'day', duplicating the day-of-month label below.
print('date_time', first_pickup)
for label, attribute in (
    ('day', 'day'),
    ('month', 'month'),
    ('year', 'year'),
    ('second', 'second'),
    ('minute', 'minute'),
    ('hour', 'hour'),
    ('day of week', 'dayofweek'),
):
    print(label, getattr(first_pickup, attribute))
# Derive calendar and clock features from each pickup timestamp.
pickup_time = uber.date_time.dt
uber['day_of_month'] = pickup_time.day
uber['month'] = pickup_time.month
uber['year'] = pickup_time.year
uber['second'] = pickup_time.second
uber['minute'] = pickup_time.minute
uber['hour'] = pickup_time.hour
# Integer weekday, with Monday as 0 and Sunday as 6.
uber['day_of_week'] = pickup_time.dayofweek
# Series.dt.weekday_name was removed in pandas 1.0; day_name() is its replacement.
uber['name_of_day'] = pickup_time.day_name()
uber.head(10)
# Count pickups for every (day name, weekday number) pair, sort by the
# day_of_week index level, and draw the counts as a bar chart.
pickups_per_weekday = uber.pivot_table(
    index=['name_of_day', 'day_of_week'],
    values='base',
    aggfunc='count',
)
uber_weekdays = pickups_per_weekday.sort_values('day_of_week')
weekday_ax = uber_weekdays.plot(kind='bar', figsize=(14, 10), legend=False)
weekday_ax.set_ylabel('Number of pick up')
# The trailing semicolon suppresses the text echo of the return value in a notebook.
weekday_ax.set_title('Uber Pick Up in Week Day', size=25);
Looking at the data, we note that the days in the middle of the week have the most requests, and that orders decrease at the weekend.
# Box plot of the pickup-hour distribution for each day name.
plt.figure(figsize=(16,12))
plt.title('Pick up in hours',size=25)
# NOTE(review): the palette lists three colours for seven day names, so seaborn
# will have to reuse them; extend the list if distinct colours per day are wanted.
sns.boxplot(x='name_of_day',y='hour',data=uber, palette=['deepskyblue','palegreen','sandybrown'],saturation=.6)
# Axis labels now match the plotted variables (they were swapped):
# day names run along x, hours along y.
plt.xlabel('Name of day',size=20)
plt.ylabel('Hour',size=20)
plt.show()
In this graph, we notice that most orders are placed between 10:00 and 18:00.
# Plot the number of pickups for each day of the month.
sns.set(style="darkgrid")
plt.figure(figsize=(15,8))
# The chart is keyed on day_of_month, so the title says so (it used to read 'week day').
plt.title('Traffic in day of month',size=25)
ax = sns.countplot(x="day_of_month", data=uber,color='blue')
Looking at the data for the month, we notice that the number of Uber pickups increased over the course of the month.
# Bar chart of the number of pickups recorded in each hour of the day.
sns.set(style="darkgrid")
hourly_fig, ax = plt.subplots(figsize=(15, 8))
ax.set_title('Traffic in day hour', size=25)
ax = sns.countplot(x="hour", data=uber, color='blue', ax=ax)
In this graph, we notice that most orders are placed between 14:00 and 22:00.
# View of the frame indexed by pickup time.
# NOTE: set_index returns a new DataFrame and the result is not assigned, so
# `uber` itself keeps 'date_time' as a regular column, which hover_name below uses.
uber.set_index('date_time')
import plotly.express as px
# Scatter every pickup location on an interactive map; hovering shows the pickup time.
fig = px.scatter_mapbox(
    uber,
    lat="lat",
    lon="lon",
    hover_name="date_time",
    color_discrete_sequence=["fuchsia"],
    zoom=8,
    height=600,
)
# Use the OpenStreetMap tile style and remove the outer margins in a single layout update.
fig.update_layout(
    mapbox_style="open-street-map",
    margin={"r":0,"t":0,"l":0,"b":0},
)
fig.show()